{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 基于图的模型"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 导入包\n",
    "import random\n",
    "import math\n",
    "import numpy as np\n",
    "import time\n",
    "from tqdm import tqdm\n",
    "from scipy.sparse import csc_matrix, linalg, eye\n",
    "from copy import deepcopy"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 一. 通用函数定义"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 定义装饰器，监控运行时间\n",
    "def timmer(func):\n",
    "    def wrapper(*args, **kwargs):\n",
    "        start_time = time.time()\n",
    "        res = func(*args, **kwargs)\n",
    "        stop_time = time.time()\n",
    "        print('Func %s, run time: %s' % (func.__name__, stop_time - start_time))\n",
    "        return res\n",
    "    return wrapper"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 1. 数据处理相关\n",
    "1. load data\n",
    "2. split data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "class Dataset():\n",
    "    \n",
    "    def __init__(self, fp):\n",
    "        # fp: data file path\n",
    "        self.data = self.loadData(fp)\n",
    "    \n",
    "    @timmer\n",
    "    def loadData(self, fp):\n",
    "        data = []\n",
    "        for l in open(fp):\n",
    "            data.append(tuple(map(int, l.strip().split('::')[:2])))\n",
    "        return data\n",
    "    \n",
    "    @timmer\n",
    "    def splitData(self, M, k, seed=1):\n",
    "        '''\n",
    "        :params: data, 加载的所有(user, item)数据条目\n",
    "        :params: M, 划分的数目，最后需要取M折的平均\n",
    "        :params: k, 本次是第几次划分，k~[0, M)\n",
    "        :params: seed, random的种子数，对于不同的k应设置成一样的\n",
    "        :return: train, test\n",
    "        '''\n",
    "        train, test = [], []\n",
    "        random.seed(seed)\n",
    "        for user, item in self.data:\n",
    "            # 这里与书中的不一致，本人认为取M-1较为合理，因randint是左右都覆盖的\n",
    "            if random.randint(0, M-1) == k:  \n",
    "                test.append((user, item))\n",
    "            else:\n",
    "                train.append((user, item))\n",
    "\n",
    "        # 处理成字典的形式，user->set(items)\n",
    "        def convert_dict(data):\n",
    "            data_dict = {}\n",
    "            for user, item in data:\n",
    "                if user not in data_dict:\n",
    "                    data_dict[user] = set()\n",
    "                data_dict[user].add(item)\n",
    "            data_dict = {k: list(data_dict[k]) for k in data_dict}\n",
    "            return data_dict\n",
    "\n",
    "        return convert_dict(train), convert_dict(test)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 2. 评价指标\n",
    "1. Precision\n",
    "2. Recall\n",
    "3. Coverage\n",
    "4. Popularity(Novelty)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "class Metric():\n",
    "    \n",
    "    def __init__(self, train, test, GetRecommendation):\n",
    "        '''\n",
    "        :params: train, 训练数据\n",
    "        :params: test, 测试数据\n",
    "        :params: GetRecommendation, 为某个用户获取推荐物品的接口函数\n",
    "        '''\n",
    "        self.train = train\n",
    "        self.test = test\n",
    "        self.GetRecommendation = GetRecommendation\n",
    "        self.recs = self.getRec()\n",
    "        \n",
    "    # 为test中的每个用户进行推荐\n",
    "    def getRec(self):\n",
    "        recs = {}\n",
    "        for user in self.test:\n",
    "            rank = self.GetRecommendation(user)\n",
    "            recs[user] = rank\n",
    "        return recs\n",
    "        \n",
    "    # 定义精确率指标计算方式\n",
    "    def precision(self):\n",
    "        all, hit = 0, 0\n",
    "        for user in self.test:\n",
    "            test_items = set(self.test[user])\n",
    "            rank = self.recs[user]\n",
    "            for item, score in rank:\n",
    "                if item in test_items:\n",
    "                    hit += 1\n",
    "            all += len(rank)\n",
    "        return round(hit / all * 100, 2)\n",
    "    \n",
    "    # 定义召回率指标计算方式\n",
    "    def recall(self):\n",
    "        all, hit = 0, 0\n",
    "        for user in self.test:\n",
    "            test_items = set(self.test[user])\n",
    "            rank = self.recs[user]\n",
    "            for item, score in rank:\n",
    "                if item in test_items:\n",
    "                    hit += 1\n",
    "            all += len(test_items)\n",
    "        return round(hit / all * 100, 2)\n",
    "    \n",
    "    # 定义覆盖率指标计算方式\n",
    "    def coverage(self):\n",
    "        all_item, recom_item = set(), set()\n",
    "        for user in self.test:\n",
    "            for item in self.train[user]:\n",
    "                all_item.add(item)\n",
    "            rank = self.recs[user]\n",
    "            for item, score in rank:\n",
    "                recom_item.add(item)\n",
    "        return round(len(recom_item) / len(all_item) * 100, 2)\n",
    "    \n",
    "    # 定义新颖度指标计算方式\n",
    "    def popularity(self):\n",
    "        # 计算物品的流行度\n",
    "        item_pop = {}\n",
    "        for user in self.train:\n",
    "            for item in self.train[user]:\n",
    "                if item not in item_pop:\n",
    "                    item_pop[item] = 0\n",
    "                item_pop[item] += 1\n",
    "\n",
    "        num, pop = 0, 0\n",
    "        for user in self.test:\n",
    "            rank = self.recs[user]\n",
    "            for item, score in rank:\n",
    "                # 取对数，防止因长尾问题带来的被流行物品所主导\n",
    "                pop += math.log(1 + item_pop[item])\n",
    "                num += 1\n",
    "        return round(pop / num, 6)\n",
    "    \n",
    "    def eval(self):\n",
    "        metric = {'Precision': self.precision(),\n",
    "                  'Recall': self.recall(),\n",
    "                  'Coverage': self.coverage(),\n",
    "                  'Popularity': self.popularity()}\n",
    "        print('Metric:', metric)\n",
    "        return metric"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 二. PersonalRank算法实现"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def PersonalRank(train, alpha, N):\n",
    "    '''\n",
    "    :params: train, 训练数据\n",
    "    :params: alpha, 继续随机游走的概率\n",
    "    :params: N, 推荐TopN物品的个数\n",
    "    :return: GetRecommendation, 获取推荐结果的接口\n",
    "    ''' \n",
    "    \n",
    "    # 构建索引\n",
    "    items = []\n",
    "    for user in train:\n",
    "        items.extend(train[user])\n",
    "    id2item = list(set(items))\n",
    "    users = {u: i for i, u in enumerate(train.keys())}\n",
    "    items = {u: i+len(users) for i, u in enumerate(id2item)}\n",
    "    \n",
    "    # 计算转移矩阵（注意！！！要按照出度进行归一化）\n",
    "    item_user = {}\n",
    "    for user in train:\n",
    "        for item in train[user]:\n",
    "            if item not in item_user:\n",
    "                item_user[item] = []\n",
    "            item_user[item].append(user)\n",
    "            \n",
    "    data, row, col = [], [], []\n",
    "    for u in train:\n",
    "        for v in train[u]:\n",
    "            data.append(1 / len(train[u]))\n",
    "            row.append(users[u])\n",
    "            col.append(items[v])\n",
    "    for u in item_user:\n",
    "        for v in item_user[u]:\n",
    "            data.append(1 / len(item_user[u]))\n",
    "            row.append(items[u])\n",
    "            col.append(users[v])\n",
    "            \n",
    "    M = csc_matrix((data, (row, col)), shape=(len(data), len(data)))\n",
    "    \n",
    "    # 获取接口函数\n",
    "    def GetRecommendation(user):\n",
    "        seen_items = set(train[user])\n",
    "        # 解矩阵方程 r = (1-a)r0 + a(M.T)r\n",
    "        r0 = [0] * len(data)\n",
    "        r0[users[user]] = 1\n",
    "        r0 = csc_matrix(r0)\n",
    "        r = (1 - alpha) * linalg.inv(eye(len(data)) - alpha * M.T) * r0\n",
    "        r = r.T.toarray()[0][len(users):]\n",
    "        idx = np.argsort(-r)[:N]\n",
    "        recs = [(id2item[ii], r[ii]) for ii in idx]\n",
    "        return recs\n",
    "    \n",
    "    return GetRecommendation"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 三. PersonalRank实验\n",
    "M=8, N=10, alpha=0.8"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "class Experiment():\n",
    "    \n",
    "    def __init__(self, M, N, alpha, fp='../dataset/ml-1m/ratings.dat'):\n",
    "        '''\n",
    "        :params: M, 进行多少次实验\n",
    "        :params: N, TopN推荐物品的个数\n",
    "        :params: alpha, 继续随机游走的概率\n",
    "        :params: fp, 数据文件路径\n",
    "        '''\n",
    "        self.M = M\n",
    "        self.N = N\n",
    "        self.alpha = alpha\n",
    "        self.fp = fp\n",
    "        self.alg = PersonalRank\n",
    "    \n",
    "    # 定义单次实验\n",
    "    @timmer\n",
    "    def worker(self, train, test):\n",
    "        '''\n",
    "        :params: train, 训练数据集\n",
    "        :params: test, 测试数据集\n",
    "        :return: 各指标的值\n",
    "        '''\n",
    "        getRecommendation = self.alg(train, self.alpha, self.N)\n",
    "        metric = Metric(train, test, getRecommendation)\n",
    "        return metric.eval()\n",
    "    \n",
    "    # 多次实验取平均\n",
    "    @timmer\n",
    "    def run(self):\n",
    "        metrics = {'Precision': 0, 'Recall': 0, \n",
    "                   'Coverage': 0, 'Popularity': 0}\n",
    "        dataset = Dataset(self.fp)\n",
    "        for ii in range(self.M):\n",
    "            train, test = dataset.splitData(self.M, ii)\n",
    "            print('Experiment {}:'.format(ii))\n",
    "            metric = self.worker(train, test)\n",
    "            metrics = {k: metrics[k]+metric[k] for k in metrics}\n",
    "        metrics = {k: metrics[k] / self.M for k in metrics}\n",
    "        print('Average Result (M={}, N={}, ratio={}): {}'.format(\\\n",
    "                              self.M, self.N, self.ratio, metrics))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# PersonalRank实验(笔记本跑的太慢，这里没贴实验结果)\n",
    "M, N, alpha = 8, 10, 0.8\n",
    "exp = Experiment(M, N, alpha)\n",
    "exp.run()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 四. 总结\n",
    "1. 可以用矩阵运算，直接找到最优解，而不需要真的一步一步去迭代。当然，循环迭代也可以实现，具体可参见书。"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
